Cleaning the Dataset

AccidentDS <- read.csv(file = '/Users/hamzausmani/CITS4009 Project 1 First Sem/us_data_2000.csv')

str(AccidentDS)
## 'data.frame':    2000 obs. of  57 variables:
##  $ MINE_ID            : int  100003 100003 100008 100011 100011 100011 100011 100011 100016 100021 ...
##  $ CONTROLLER_ID      : chr  "41044" "41044" "M31753" "M11763" ...
##  $ CONTROLLER_NAME    : chr  "Lhoist Group" "Lhoist Group" "Alan B  Cheney" "Imerys S A" ...
##  $ OPERATOR_ID        : chr  "L13586" "L13586" "L31753" "L17074" ...
##  $ OPERATOR_NAME      : chr  "Lhoist North America " "Lhoist North America " "Cheney Lime & Cement Company" "Imerys Pigments LLC" ...
##  $ CONTRACTOR_ID      : chr  "" "" "" "" ...
##  $ DOCUMENT_NO        : num  2.2e+11 2.2e+11 2.2e+11 2.2e+11 2.2e+11 ...
##  $ SUBUNIT_CD         : int  3 30 30 30 30 30 30 30 3 3 ...
##  $ SUBUNIT            : chr  "STRIP, QUARY, OPEN PIT" "MILL OPERATION/PREPARATION PLANT" "MILL OPERATION/PREPARATION PLANT" "MILL OPERATION/PREPARATION PLANT" ...
##  $ ACCIDENT_DT        : chr  "14/03/2012" "8/01/2007" "4/07/2009" "26/05/2000" ...
##  $ CAL_YR             : int  2012 2007 2009 2000 2005 2006 2008 2012 2000 2006 ...
##  $ CAL_QTR            : int  1 1 3 2 1 1 4 2 3 1 ...
##  $ FISCAL_YR          : int  2012 2007 2009 2000 2005 2006 2009 2012 2000 2006 ...
##  $ FISCAL_QTR         : int  2 2 4 3 2 2 1 3 4 2 ...
##  $ ACCIDENT_TIME      : int  945 1105 1000 1100 1430 1130 430 930 730 230 ...
##  $ DEGREE_INJURY_CD   : chr  "5" "6" "3" "5" ...
##  $ DEGREE_INJURY      : chr  "DAYS RESTRICTED ACTIVITY ONLY" "NO DYS AWY FRM WRK,NO RSTR ACT" "DAYS AWAY FROM WORK ONLY" "DAYS RESTRICTED ACTIVITY ONLY" ...
##  $ FIPS_STATE_CD      : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ UG_LOCATION_CD     : chr  "?" "?" "?" "?" ...
##  $ UG_LOCATION        : chr  "NO VALUE FOUND" "NO VALUE FOUND" "NO VALUE FOUND" "NO VALUE FOUND" ...
##  $ UG_MINING_METHOD_CD: chr  "?" "?" "?" "?" ...
##  $ UG_MINING_METHOD   : chr  "NO VALUE FOUND" "NO VALUE FOUND" "NO VALUE FOUND" "NO VALUE FOUND" ...
##  $ MINING_EQUIP_CD    : chr  "24" "28" "?" "?" ...
##  $ MINING_EQUIP       : chr  "Front-end loader, Tractor-shovel, Payloader, Highlift, Skip loader" "Hand tools (not powered)" "NO VALUE FOUND" "NO VALUE FOUND" ...
##  $ EQUIP_MFR_CD       : chr  "119" "121" "?" "?" ...
##  $ EQUIP_MFR_NAME     : chr  "Not on this list" "Not Reported" "NO VALUE FOUND" "NO VALUE FOUND" ...
##  $ EQUIP_MODEL_NO     : chr  "22321" "" "" "?" ...
##  $ SHIFT_BEGIN_TIME   : int  600 700 600 700 700 700 2300 700 700 1800 ...
##  $ CLASSIFICATION_CD  : chr  "12" "10" "18" "9" ...
##  $ CLASSIFICATION     : chr  "POWERED HAULAGE" "HANDTOOLS (NONPOWERED)" "SLIP OR FALL OF PERSON" "HANDLING OF MATERIALS" ...
##  $ ACCIDENT_TYPE_CD   : chr  "21" "8" "30" "27" ...
##  $ ACCIDENT_TYPE      : chr  "CGHT I, U, B, MVNG & STTN OBJS" "STRUCK BY, NEC" "OVER-EXERTION, NEC" "OVER-EXERTION IN LIFTING OBJS" ...
##  $ NO_INJURIES        : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ TOT_EXPER          : num  4.35 0.02 10 NA 0.87 ...
##  $ MINE_EXPER         : num  4.35 0.02 2.15 0.23 0.87 ...
##  $ JOB_EXPER          : num  0.67 0.02 2.15 0.23 0.38 ...
##  $ OCCUPATION_CD      : chr  "374" "374" "374" "374" ...
##  $ OCCUPATION         : chr  "Warehouseman, Bagger, Palletizer/Stacker, Store keeper, Packager, Fabricator, Cleaning plant operator" "Warehouseman, Bagger, Palletizer/Stacker, Store keeper, Packager, Fabricator, Cleaning plant operator" "Warehouseman, Bagger, Palletizer/Stacker, Store keeper, Packager, Fabricator, Cleaning plant operator" "Warehouseman, Bagger, Palletizer/Stacker, Store keeper, Packager, Fabricator, Cleaning plant operator" ...
##  $ ACTIVITY_CD        : chr  "28" "30" "13" "28" ...
##  $ ACTIVITY           : chr  "HANDLING SUPPLIES/MATERIALS" "HAND TOOLS (NOT POWERED)" "CLIMB SCAFFOLDS/LADDERS/PLATFORMS" "HANDLING SUPPLIES/MATERIALS" ...
##  $ INJURY_SOURCE_CD   : chr  "76" "46" "117" "4" ...
##  $ INJURY_SOURCE      : chr  "SURFACE MINING MACHINES" "AXE,HAMMER,SLEDGE" "GROUND" "BAGS" ...
##  $ NATURE_INJURY_CD   : chr  "160" "180" "330" "330" ...
##  $ NATURE_INJURY      : chr  "CONTUSN,BRUISE,INTAC SKIN" "CUT,LACER,PUNCT-OPN WOUND" "SPRAIN,STRAIN RUPT DISC" "SPRAIN,STRAIN RUPT DISC" ...
##  $ INJ_BODY_PART_CD   : chr  "700" "100" "520" "420" ...
##  $ INJ_BODY_PART      : chr  "MULTIPLE PARTS (MORE THAN ONE MAJOR)" "HEAD,NEC" "ANKLE" "BACK (MUSCLES/SPINE/S-CORD/TAILBONE)" ...
##  $ SCHEDULE_CHARGE    : int  0 0 0 NA 0 0 0 0 NA 0 ...
##  $ DAYS_RESTRICT      : int  8 0 0 5 5 3 0 21 10 19 ...
##  $ DAYS_LOST          : int  0 0 9 NA 0 0 0 0 NA 13 ...
##  $ TRANS_TERM         : chr  "N" "N" "N" "N" ...
##  $ RETURN_TO_WORK_DT  : chr  "03/26/2012" "1/09/2007" "07/14/2009" "6/01/2000" ...
##  $ IMMED_NOTIFY_CD    : chr  "? " "? " "? " "13" ...
##  $ IMMED_NOTIFY       : chr  "NO VALUE FOUND" "NO VALUE FOUND" "NO VALUE FOUND" "NOT MARKED" ...
##  $ INVEST_BEGIN_DT    : chr  "" "" "" "" ...
##  $ NARRATIVE          : chr  "Employee was cleaning up at the Primary Crusher with the Dingo skid steer. The employee slipped and fell while "| __truncated__ "Handle of sledgehammer broke and head of hammer hit employee in the forehead." "EMPLOYEE WAS CLIMBING DOWN A LADDER AND WHEN HE STEPPED TO THE GROUND HE SLIPPED AND SPRAINED HIS LEFT ANKLE." "HE PULLED A BACK MUSCLE WHILE STACKING BAGS OF MATERIAL." ...
##  $ CLOSED_DOC_NO      : num  NA NA 3.2e+11 3.2e+11 NA ...
##  $ COAL_METAL_IND     : chr  "M" "M" "M" "M" ...

#Identifying the number of nullpoints within the variables and the dimension of the dataset

apply(is.na(AccidentDS), 2, sum)
##             MINE_ID       CONTROLLER_ID     CONTROLLER_NAME         OPERATOR_ID 
##                   0                   0                   0                   0 
##       OPERATOR_NAME       CONTRACTOR_ID         DOCUMENT_NO          SUBUNIT_CD 
##                   0                   0                   0                   0 
##             SUBUNIT         ACCIDENT_DT              CAL_YR             CAL_QTR 
##                   0                   0                   0                   0 
##           FISCAL_YR          FISCAL_QTR       ACCIDENT_TIME    DEGREE_INJURY_CD 
##                   0                   0                   0                   0 
##       DEGREE_INJURY       FIPS_STATE_CD      UG_LOCATION_CD         UG_LOCATION 
##                   0                   0                   0                   0 
## UG_MINING_METHOD_CD    UG_MINING_METHOD     MINING_EQUIP_CD        MINING_EQUIP 
##                   0                   0                   0                   0 
##        EQUIP_MFR_CD      EQUIP_MFR_NAME      EQUIP_MODEL_NO    SHIFT_BEGIN_TIME 
##                   0                   0                   1                   9 
##   CLASSIFICATION_CD      CLASSIFICATION    ACCIDENT_TYPE_CD       ACCIDENT_TYPE 
##                   0                   0                   0                   0 
##         NO_INJURIES           TOT_EXPER          MINE_EXPER           JOB_EXPER 
##                   0                 360                 333                 328 
##       OCCUPATION_CD          OCCUPATION         ACTIVITY_CD            ACTIVITY 
##                   0                   0                   0                   0 
##    INJURY_SOURCE_CD       INJURY_SOURCE    NATURE_INJURY_CD       NATURE_INJURY 
##                   0                   0                   0                   0 
##    INJ_BODY_PART_CD       INJ_BODY_PART     SCHEDULE_CHARGE       DAYS_RESTRICT 
##                   0                   0                 673                 543 
##           DAYS_LOST          TRANS_TERM   RETURN_TO_WORK_DT     IMMED_NOTIFY_CD 
##                 422                   0                   0                   0 
##        IMMED_NOTIFY     INVEST_BEGIN_DT           NARRATIVE       CLOSED_DOC_NO 
##                   0                   0                   0                1138 
##      COAL_METAL_IND 
##                   0
dimension <- c(ncol(AccidentDS),nrow(AccidentDS))
dimension
## [1]   57 2000

There are 57 columns and 2000 rows

There are many unnecessary columns which should be removed in order to prevent hampering our visualization

RefinedColNames <- colnames(AccidentDS)

Before removing unnecessary columns there are many points to be noted There are many columns which have an equivalent number of same columns such as: CONTROLLER_ID is equivalent to CONTROLLER_NAME OPERATOR_ID is equivalent to OPERATOR_NAME SUBUNIT_CS is equivalent to SUBUNIT The year subset of ACCIDENT_DT is equivalent to CAL_YR DEGREE_INJURY_CD is equivalent to DEGREE_INJURY UG_LOCATION_CD is equivalent to UG_LOCATION UG_MINING_CD is equivalent to UG_MINING_METHOD MINING_EQUIP_CD is equivalent to MINING_EQUIP EQUIP_MFR_CD is equivalent to EQUIP_MFR_NAME CLASSIFICATION_CD is equivalent to CLASSIFICATION ACCIDENT_TYPE_CD is equivalent to ACCIDENT_TYPE OCCUPATION_CD is equivalent to OCCUPATION ACTIVITY_CD is equivalent to ACTIVITY INJURY_SOURCE_CD is equivalent to INJURY_SOURCE NATURE_INJURY_CD is equivalent to NATURE_INJURY INJ_BODY_PART_CD is equivalent to INJ_BODY_PART IMMED_NOTIFY_CD is equivalent to IMMED_NOTIFY

Now we are checking the columns that have missing values

DropColumns<- AccidentDS

drop <- c("MINE_ID","CONTROLLER_ID","OPERATOR_ID","DOCUMENT_NO","SUBUNIT_CD","ACCIDENT_DT","DEGREE_INJURY_CD","UG_LOCATION_CD","UG_MINING_METHOD_CD","MINING_EQUIP_CD","EQUIP_MFR_CD","CLASSIFICATION_CD","ACCIDENT_TYPE_CD","ACCIDENT_TYPE_CD","OCCUPATION_CD","ACTIVITY_CD","INJURY_SOURCE_CD","NATURE_INJURY_CD","INJ_BODY_PART_CD","IMMED_NOTIFY_CD","NARRATIVE","CLOSED_DOC_NO")

df = DropColumns[,!(names(DropColumns) %in% drop)]

head(df)
##   CONTROLLER_NAME                OPERATOR_NAME CONTRACTOR_ID
## 1    Lhoist Group        Lhoist North America               
## 2    Lhoist Group        Lhoist North America               
## 3  Alan B  Cheney Cheney Lime & Cement Company              
## 4      Imerys S A          Imerys Pigments LLC              
## 5      Imerys S A          Imerys Pigments LLC              
## 6      Imerys S A          Imerys Pigments LLC              
##                            SUBUNIT CAL_YR CAL_QTR FISCAL_YR FISCAL_QTR
## 1           STRIP, QUARY, OPEN PIT   2012       1      2012          2
## 2 MILL OPERATION/PREPARATION PLANT   2007       1      2007          2
## 3 MILL OPERATION/PREPARATION PLANT   2009       3      2009          4
## 4 MILL OPERATION/PREPARATION PLANT   2000       2      2000          3
## 5 MILL OPERATION/PREPARATION PLANT   2005       1      2005          2
## 6 MILL OPERATION/PREPARATION PLANT   2006       1      2006          2
##   ACCIDENT_TIME                  DEGREE_INJURY FIPS_STATE_CD    UG_LOCATION
## 1           945  DAYS RESTRICTED ACTIVITY ONLY             1 NO VALUE FOUND
## 2          1105 NO DYS AWY FRM WRK,NO RSTR ACT             1 NO VALUE FOUND
## 3          1000       DAYS AWAY FROM WORK ONLY             1 NO VALUE FOUND
## 4          1100  DAYS RESTRICTED ACTIVITY ONLY             1 NO VALUE FOUND
## 5          1430  DAYS RESTRICTED ACTIVITY ONLY             1 NO VALUE FOUND
## 6          1130  DAYS RESTRICTED ACTIVITY ONLY             1 NO VALUE FOUND
##   UG_MINING_METHOD
## 1   NO VALUE FOUND
## 2   NO VALUE FOUND
## 3   NO VALUE FOUND
## 4   NO VALUE FOUND
## 5   NO VALUE FOUND
## 6   NO VALUE FOUND
##                                                         MINING_EQUIP
## 1 Front-end loader, Tractor-shovel, Payloader, Highlift, Skip loader
## 2                                           Hand tools (not powered)
## 3                                                     NO VALUE FOUND
## 4                                                     NO VALUE FOUND
## 5                                                     NO VALUE FOUND
## 6                                                     NO VALUE FOUND
##     EQUIP_MFR_NAME EQUIP_MODEL_NO SHIFT_BEGIN_TIME         CLASSIFICATION
## 1 Not on this list          22321              600        POWERED HAULAGE
## 2     Not Reported                             700 HANDTOOLS (NONPOWERED)
## 3   NO VALUE FOUND                             600 SLIP OR FALL OF PERSON
## 4   NO VALUE FOUND              ?              700  HANDLING OF MATERIALS
## 5   NO VALUE FOUND                             700  HANDLING OF MATERIALS
## 6   NO VALUE FOUND                             700                  OTHER
##                    ACCIDENT_TYPE NO_INJURIES TOT_EXPER MINE_EXPER JOB_EXPER
## 1 CGHT I, U, B, MVNG & STTN OBJS           1      4.35       4.35      0.67
## 2                 STRUCK BY, NEC           1      0.02       0.02      0.02
## 3             OVER-EXERTION, NEC           1     10.00       2.15      2.15
## 4  OVER-EXERTION IN LIFTING OBJS           1        NA       0.23      0.23
## 5 ABSRTN RAD CAUST TXC & NOX SBS           1      0.87       0.87      0.38
## 6           BODILY REACTION, NEC           1      5.62       5.62      5.62
##                                                                                              OCCUPATION
## 1 Warehouseman, Bagger, Palletizer/Stacker, Store keeper, Packager, Fabricator, Cleaning plant operator
## 2 Warehouseman, Bagger, Palletizer/Stacker, Store keeper, Packager, Fabricator, Cleaning plant operator
## 3 Warehouseman, Bagger, Palletizer/Stacker, Store keeper, Packager, Fabricator, Cleaning plant operator
## 4 Warehouseman, Bagger, Palletizer/Stacker, Store keeper, Packager, Fabricator, Cleaning plant operator
## 5              Front-end loader, Scraper-loader operator, Pan operator, Payloader, Scraper rig operator
## 6     Maintenance man, Mechanic,  Repair/Serviceman, Boilermaker, Fueler, Tire tech, Field service tech
##                            ACTIVITY           INJURY_SOURCE
## 1       HANDLING SUPPLIES/MATERIALS SURFACE MINING MACHINES
## 2          HAND TOOLS (NOT POWERED)       AXE,HAMMER,SLEDGE
## 3 CLIMB SCAFFOLDS/LADDERS/PLATFORMS                  GROUND
## 4       HANDLING SUPPLIES/MATERIALS                    BAGS
## 5            WORKING WITH CHEMICALS ACIDS,ALKALI,WET CEMENT
## 6        MACHINE MAINTENANCE/REPAIR           BODILY MOTION
##               NATURE_INJURY                        INJ_BODY_PART
## 1 CONTUSN,BRUISE,INTAC SKIN MULTIPLE PARTS (MORE THAN ONE MAJOR)
## 2 CUT,LACER,PUNCT-OPN WOUND                             HEAD,NEC
## 3   SPRAIN,STRAIN RUPT DISC                                ANKLE
## 4   SPRAIN,STRAIN RUPT DISC BACK (MUSCLES/SPINE/S-CORD/TAILBONE)
## 5 BURN,CHEMICL-FUME,COMPOUN          HAND (NOT WRIST OR FINGERS)
## 6   SPRAIN,STRAIN RUPT DISC BACK (MUSCLES/SPINE/S-CORD/TAILBONE)
##   SCHEDULE_CHARGE DAYS_RESTRICT DAYS_LOST TRANS_TERM RETURN_TO_WORK_DT
## 1               0             8         0          N        03/26/2012
## 2               0             0         0          N         1/09/2007
## 3               0             0         9          N        07/14/2009
## 4              NA             5        NA          N         6/01/2000
## 5               0             5         0          N         2/01/2005
## 6               0             3         0          N         4/04/2006
##     IMMED_NOTIFY INVEST_BEGIN_DT COAL_METAL_IND
## 1 NO VALUE FOUND                              M
## 2 NO VALUE FOUND                              M
## 3 NO VALUE FOUND                              M
## 4     NOT MARKED                              M
## 5 NO VALUE FOUND      01/22/2005              M
## 6 NO VALUE FOUND                              M
colnames(df)
##  [1] "CONTROLLER_NAME"   "OPERATOR_NAME"     "CONTRACTOR_ID"    
##  [4] "SUBUNIT"           "CAL_YR"            "CAL_QTR"          
##  [7] "FISCAL_YR"         "FISCAL_QTR"        "ACCIDENT_TIME"    
## [10] "DEGREE_INJURY"     "FIPS_STATE_CD"     "UG_LOCATION"      
## [13] "UG_MINING_METHOD"  "MINING_EQUIP"      "EQUIP_MFR_NAME"   
## [16] "EQUIP_MODEL_NO"    "SHIFT_BEGIN_TIME"  "CLASSIFICATION"   
## [19] "ACCIDENT_TYPE"     "NO_INJURIES"       "TOT_EXPER"        
## [22] "MINE_EXPER"        "JOB_EXPER"         "OCCUPATION"       
## [25] "ACTIVITY"          "INJURY_SOURCE"     "NATURE_INJURY"    
## [28] "INJ_BODY_PART"     "SCHEDULE_CHARGE"   "DAYS_RESTRICT"    
## [31] "DAYS_LOST"         "TRANS_TERM"        "RETURN_TO_WORK_DT"
## [34] "IMMED_NOTIFY"      "INVEST_BEGIN_DT"   "COAL_METAL_IND"
#apply(is.na(df), 2, sum) including the non missing values

We need to convert the missing values to NA

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
mutated_data <-
  mutate(df,
         CONTRACTOR_ID = na_if(CONTRACTOR_ID, ""),
         ACCIDENT_TIME = ifelse(ACCIDENT_TIME >2400, NA, ACCIDENT_TIME),
         UG_LOCATION = ifelse(UG_LOCATION =="NO VALUE FOUND", NA, UG_LOCATION),
         UG_MINING_METHOD = ifelse(UG_MINING_METHOD =="NO VALUE FOUND", NA, UG_MINING_METHOD),
         MINING_EQUIP = ifelse(MINING_EQUIP =="NO VALUE FOUND", NA, MINING_EQUIP),
         EQUIP_MFR_NAME = ifelse(EQUIP_MFR_NAME =="NO VALUE FOUND", NA, EQUIP_MFR_NAME),
         EQUIP_MODEL_NO = ifelse(EQUIP_MODEL_NO =="?" || EQUIP_MODEL_NO =="", NA, EQUIP_MODEL_NO),
         SHIFT_BEGIN_TIME = ifelse(SHIFT_BEGIN_TIME >2400, NA, SHIFT_BEGIN_TIME),
         TOT_EXPER = ifelse(TOT_EXPER =="", NA, TOT_EXPER),
         MINE_EXPER = ifelse(MINE_EXPER =="", NA, MINE_EXPER),
         JOB_EXPER = ifelse(JOB_EXPER =="", NA, JOB_EXPER),
         OCCUPATION = ifelse(OCCUPATION =="NO VALUE FOUND", NA, OCCUPATION),
         ACTIVITY = ifelse(ACTIVITY =="NO VALUE FOUND", NA, ACTIVITY),
         INJURY_SOURCE = ifelse(INJURY_SOURCE =="NO VALUE FOUND", NA, INJURY_SOURCE),
         NATURE_INJURY = ifelse(NATURE_INJURY =="NO VALUE FOUND", NA, NATURE_INJURY),
         INJ_BODY_PART = ifelse(INJ_BODY_PART =="NO VALUE FOUND", NA, INJ_BODY_PART),
         SCHEDULE_CHARGE = ifelse(SCHEDULE_CHARGE =="", NA, SCHEDULE_CHARGE),
         DAYS_RESTRICT = ifelse(DAYS_RESTRICT =="", NA, DAYS_RESTRICT),
         DAYS_LOST = ifelse(DAYS_LOST =="", NA, DAYS_LOST),
         TRANS_TERM = ifelse(TRANS_TERM =="", NA, TRANS_TERM),
         RETURN_TO_WORK_DT = ifelse(RETURN_TO_WORK_DT =="", NA, RETURN_TO_WORK_DT),
         IMMED_NOTIFY = ifelse(IMMED_NOTIFY =="NO VALUE FOUND", NA, IMMED_NOTIFY),
         INVEST_BEGIN_DT = ifelse(INVEST_BEGIN_DT =="", NA, INVEST_BEGIN_DT),
         COAL_METAL_IND = ifelse(COAL_METAL_IND =="", NA, COAL_METAL_IND)
    )
## Warning in EQUIP_MODEL_NO == "?" || EQUIP_MODEL_NO == "": 'length(x) = 2000 > 1'
## in coercion to 'logical(1)'

## Warning in EQUIP_MODEL_NO == "?" || EQUIP_MODEL_NO == "": 'length(x) = 2000 > 1'
## in coercion to 'logical(1)'

Checking the number of null values now after converting “?” and “VALUE NOT FOUND” to NA

count_missing <- function(df) {
sapply(df, FUN = function(col) sum(is.na(col)) )
}
nacounts <- count_missing(mutated_data)
hasNA = which(nacounts > 0)
nacounts[hasNA]
##     CONTRACTOR_ID     ACCIDENT_TIME       UG_LOCATION  UG_MINING_METHOD 
##              1830               139              1284              1392 
##      MINING_EQUIP    EQUIP_MFR_NAME  SHIFT_BEGIN_TIME         TOT_EXPER 
##              1053              1056                94               360 
##        MINE_EXPER         JOB_EXPER        OCCUPATION          ACTIVITY 
##               333               328               242               239 
##     INJURY_SOURCE     NATURE_INJURY     INJ_BODY_PART   SCHEDULE_CHARGE 
##               238               242               242               673 
##     DAYS_RESTRICT         DAYS_LOST        TRANS_TERM RETURN_TO_WORK_DT 
##               543               422               267               322 
##      IMMED_NOTIFY   INVEST_BEGIN_DT 
##              1139              1530

By looking at the dataset after replacing all the sentinel values with NA we see that CONTRACTOR_ID column has most of its values as NA so it is safe to drop it

colnames(mutated_data)
##  [1] "CONTROLLER_NAME"   "OPERATOR_NAME"     "CONTRACTOR_ID"    
##  [4] "SUBUNIT"           "CAL_YR"            "CAL_QTR"          
##  [7] "FISCAL_YR"         "FISCAL_QTR"        "ACCIDENT_TIME"    
## [10] "DEGREE_INJURY"     "FIPS_STATE_CD"     "UG_LOCATION"      
## [13] "UG_MINING_METHOD"  "MINING_EQUIP"      "EQUIP_MFR_NAME"   
## [16] "EQUIP_MODEL_NO"    "SHIFT_BEGIN_TIME"  "CLASSIFICATION"   
## [19] "ACCIDENT_TYPE"     "NO_INJURIES"       "TOT_EXPER"        
## [22] "MINE_EXPER"        "JOB_EXPER"         "OCCUPATION"       
## [25] "ACTIVITY"          "INJURY_SOURCE"     "NATURE_INJURY"    
## [28] "INJ_BODY_PART"     "SCHEDULE_CHARGE"   "DAYS_RESTRICT"    
## [31] "DAYS_LOST"         "TRANS_TERM"        "RETURN_TO_WORK_DT"
## [34] "IMMED_NOTIFY"      "INVEST_BEGIN_DT"   "COAL_METAL_IND"
DropColumns<- mutated_data
drop <- c("CONTRACTOR_ID")
mutated_data = DropColumns[,!(names(DropColumns) %in% drop)]
colnames(mutated_data)
##  [1] "CONTROLLER_NAME"   "OPERATOR_NAME"     "SUBUNIT"          
##  [4] "CAL_YR"            "CAL_QTR"           "FISCAL_YR"        
##  [7] "FISCAL_QTR"        "ACCIDENT_TIME"     "DEGREE_INJURY"    
## [10] "FIPS_STATE_CD"     "UG_LOCATION"       "UG_MINING_METHOD" 
## [13] "MINING_EQUIP"      "EQUIP_MFR_NAME"    "EQUIP_MODEL_NO"   
## [16] "SHIFT_BEGIN_TIME"  "CLASSIFICATION"    "ACCIDENT_TYPE"    
## [19] "NO_INJURIES"       "TOT_EXPER"         "MINE_EXPER"       
## [22] "JOB_EXPER"         "OCCUPATION"        "ACTIVITY"         
## [25] "INJURY_SOURCE"     "NATURE_INJURY"     "INJ_BODY_PART"    
## [28] "SCHEDULE_CHARGE"   "DAYS_RESTRICT"     "DAYS_LOST"        
## [31] "TRANS_TERM"        "RETURN_TO_WORK_DT" "IMMED_NOTIFY"     
## [34] "INVEST_BEGIN_DT"   "COAL_METAL_IND"
nacounts <- count_missing(mutated_data)
hasNA = which(nacounts > 0)
nacounts[hasNA]
##     ACCIDENT_TIME       UG_LOCATION  UG_MINING_METHOD      MINING_EQUIP 
##               139              1284              1392              1053 
##    EQUIP_MFR_NAME  SHIFT_BEGIN_TIME         TOT_EXPER        MINE_EXPER 
##              1056                94               360               333 
##         JOB_EXPER        OCCUPATION          ACTIVITY     INJURY_SOURCE 
##               328               242               239               238 
##     NATURE_INJURY     INJ_BODY_PART   SCHEDULE_CHARGE     DAYS_RESTRICT 
##               242               242               673               543 
##         DAYS_LOST        TRANS_TERM RETURN_TO_WORK_DT      IMMED_NOTIFY 
##               422               267               322              1139 
##   INVEST_BEGIN_DT 
##              1530
dimension <- c(ncol(mutated_data),nrow(mutated_data))
dimension
## [1]   35 2000

We see that column SHIFT_BEGIN_TIME has 94 null rows so it is safe to drop the rows

newdata <-
  mutated_data[!is.na(mutated_data$SHIFT_BEGIN_TIME),]
nrow(mutated_data)
## [1] 2000
nrow(newdata)
## [1] 1906
nacounts <- count_missing(newdata)
hasNA = which(nacounts > 0)
nacounts[hasNA]
##     ACCIDENT_TIME       UG_LOCATION  UG_MINING_METHOD      MINING_EQUIP 
##                91              1221              1319               984 
##    EQUIP_MFR_NAME         TOT_EXPER        MINE_EXPER         JOB_EXPER 
##               987               311               285               282 
##        OCCUPATION          ACTIVITY     INJURY_SOURCE     NATURE_INJURY 
##               211               209               209               210 
##     INJ_BODY_PART   SCHEDULE_CHARGE     DAYS_RESTRICT         DAYS_LOST 
##               210               619               499               386 
##        TRANS_TERM RETURN_TO_WORK_DT      IMMED_NOTIFY   INVEST_BEGIN_DT 
##               237               284              1111              1464
newdata2 <-  newdata[!is.na(newdata$ACCIDENT_TIME),]
nrow(mutated_data)
## [1] 2000
nrow(newdata2)
## [1] 1815

after removing the rows from ACCIDENT_TIME which were 87 rows and it was safe to remove them

dimension <- c(ncol(newdata2),nrow(newdata2))
dimension
## [1]   35 1815
nacounts <- count_missing(newdata2)
hasNA = which(nacounts > 0)
nacounts[hasNA]
##       UG_LOCATION  UG_MINING_METHOD      MINING_EQUIP    EQUIP_MFR_NAME 
##              1169              1265               921               923 
##         TOT_EXPER        MINE_EXPER         JOB_EXPER        OCCUPATION 
##               276               250               249               182 
##          ACTIVITY     INJURY_SOURCE     NATURE_INJURY     INJ_BODY_PART 
##               180               180               181               181 
##   SCHEDULE_CHARGE     DAYS_RESTRICT         DAYS_LOST        TRANS_TERM 
##               601               483               376               203 
## RETURN_TO_WORK_DT      IMMED_NOTIFY   INVEST_BEGIN_DT 
##               249              1056              1396

Now we replace the variables that numeric with the mean of the column

library(dplyr)
new_data3 <-
  mutate(newdata2,
         TOT_EXPER = ifelse(is.na(TOT_EXPER),format(round(mean(newdata2$TOT_EXPER, na.rm = TRUE), 3), nsmall = 3) , TOT_EXPER),
         MINE_EXPER = ifelse(is.na(MINE_EXPER), format(round(mean(newdata2$MINE_EXPER, na.rm = TRUE), 3), nsmall = 3), MINE_EXPER),
         JOB_EXPER = ifelse(is.na(JOB_EXPER), format(round(mean(newdata2$JOB_EXPER, na.rm = TRUE), 3), nsmall = 3), JOB_EXPER),    )
nacounts <- count_missing(new_data3)
hasNA = which(nacounts > 0)
nacounts[hasNA]
##       UG_LOCATION  UG_MINING_METHOD      MINING_EQUIP    EQUIP_MFR_NAME 
##              1169              1265               921               923 
##        OCCUPATION          ACTIVITY     INJURY_SOURCE     NATURE_INJURY 
##               182               180               180               181 
##     INJ_BODY_PART   SCHEDULE_CHARGE     DAYS_RESTRICT         DAYS_LOST 
##               181               601               483               376 
##        TRANS_TERM RETURN_TO_WORK_DT      IMMED_NOTIFY   INVEST_BEGIN_DT 
##               203               249              1056              1396

Now we process the Categorical values and replace the NA values with a variable which makes sense

library(dplyr)
new_data4 <-
  mutate(new_data3,
         UG_LOCATION = ifelse(is.na(UG_LOCATION), "UNKNOWN_UG_LOCATION" , UG_LOCATION),
         UG_MINING_METHOD = ifelse(is.na(UG_MINING_METHOD), "UNKNOWN_UG_MINING_METHOD" , UG_MINING_METHOD),
         MINING_EQUIP = ifelse(is.na(MINING_EQUIP), "INVALID_EQUIPMENT" , MINING_EQUIP),
         EQUIP_MFR_NAME = ifelse(is.na(EQUIP_MFR_NAME), "INVALID_MFR_CODE", EQUIP_MFR_NAME),
         OCCUPATION = ifelse(is.na(OCCUPATION), "INVALID_OCCUPATION", OCCUPATION),
         ACTIVITY = ifelse(is.na(ACTIVITY), "INVALID_ACTIVITY", ACTIVITY),
         INJURY_SOURCE = ifelse(is.na(INJURY_SOURCE), "INVALID_INJURY", INJURY_SOURCE),
         NATURE_INJURY = ifelse(is.na(NATURE_INJURY), "INVALID_NATURE_INJURY", NATURE_INJURY),
         INJ_BODY_PART = ifelse(is.na(INJ_BODY_PART), "INVALID_INJ_BODY_PART", INJ_BODY_PART),
         SCHEDULE_CHARGE = ifelse(is.na(SCHEDULE_CHARGE), format(round(mean(new_data3$SCHEDULE_CHARGE, na.rm = TRUE), 3), nsmall = 3), SCHEDULE_CHARGE),
         DAYS_RESTRICT = ifelse(is.na(DAYS_RESTRICT), 0, DAYS_RESTRICT), 
         # Since in the data dictionary or the details of the variables it is clearly stated
         # that if there are null values in this case then that means the DAYS_RESTRICT would be 0 for the null values which do not apply
         DAYS_LOST = ifelse(is.na(DAYS_LOST), 0, DAYS_LOST),
         # Since in the data dictionary or the details of the variables it is clearly stated
         # that if there are null values in this case then that means the DAYS_LOST would be 0 for the null values which do not apply
         TRANS_TERM = ifelse(is.na(TRANS_TERM), "INVALID_TRANS_TERM", TRANS_TERM),
         RETURN_TO_WORK_DT = ifelse(is.na(RETURN_TO_WORK_DT), "INVALID_RETURN_TO_WORK_DT", RETURN_TO_WORK_DT),
         IMMED_NOTIFY = ifelse(is.na(IMMED_NOTIFY), "INVALID_NOTIFICATION", IMMED_NOTIFY),
         INVEST_BEGIN_DT = ifelse(is.na(INVEST_BEGIN_DT), "INVALID_INVEST_BEGIN_DT", INVEST_BEGIN_DT),
    )
#CONVERTING THE RETURN_TO_WORK_DT from date format to year format
nacounts <- count_missing(new_data4)
hasNA = which(nacounts > 0)
nacounts[hasNA]
## named integer(0)

This point is at the ending of the DATA CLEANING AND PROCESSING stage now we visualise with the variables we are left with after Cleaning and Processing the data

Exploratory Data Analysis with one variable

Bar charts

Firstly we try to find the frequency of the DEGREE_INJURY column to check the severity of the injury of the accident using a Sorted Horizontal Bar Charts

library(ggplot2)
#Over here we see that there needs a reordering of the values in order to find a clearer output
#So we sort the values alphabetically in the increasing order and added default colors to pick all the variables
statesums <- table(new_data4$DEGREE_INJURY)
statef <- as.data.frame(statesums)
colnames(statef) <- c("DEGREE_INJURY", "count")
 statef <- transform(statef,
          DEGREE_INJURY=reorder(DEGREE_INJURY, -desc(count)))


g <- ggplot(statef, 
            aes(x = DEGREE_INJURY, y = count, label = count, fill="maroon"))+ coord_flip() +  geom_col() +  geom_text(nudge_y = -1, color = "black") +   ggtitle("DEGREE_INJURY against count") +   xlab("DEGREE_INJURY") + ylab("count") + 
  theme_bw() + theme(legend.position = "none") + 
  theme(plot.title = element_text(hjust = 0.8)) 
g

In the Plot above we get to see that the severity of the injuries or the degree of injuries where the most of all the categories the employees fell in the category where they had Days away from work only, “No days away from work and no restricted activities” and “Days restricted activity only” which means employees mostly were away from work and some even did not get days off and action restrictions in the work area and some of them had days restricted activity only. Other categories include “Accidents only” and “Days away from work and restricted actions” which means they faced just the accidents and days off from work with limited activities in the work environment. Other minor categories included “Fatality”,“Occupational Illness”.

Next we will explore the UG_LOCATION column to check where the accident occured or where the employee was when the accident occured

library(ggplot2)

#Over here we see that there also needs a reordering of the values in order to find a clearer output
#So we sort the values alphabetically in the increasing order and added default colors to pick all the variables

statesums <- table(new_data4$UG_LOCATION)
statef <- as.data.frame(statesums)
colnames(statef) <- c("UG_LOCATION", "count")
 statef <- transform(statef,
          UG_LOCATION=reorder(UG_LOCATION, -desc(count)))

g <- ggplot(statef, 
            aes(x = UG_LOCATION, y = count, label = count, fill="maroon"))+ coord_flip() +

  geom_col() +
  geom_text(nudge_y = -1, color = "black") + 
  ggtitle("UG_LOCATION against count") + 
  xlab("UG_LOCATION") + ylab("count") +
  theme_bw() + theme(legend.position = "none") + 
  theme(plot.title = element_text(hjust = 0.8)) 
g

In the plot above we get get to see there is a suspiciously high count for UNKNOWN_UG_LOCATION meaning the underground location is unknown to us. But if we see the corresponding SUBUNIT column we can deduce something so we plot the barchart for the subunits to see which subunit the accident occured

library(ggplot2)
#Over here we see that there also needs a reordering of the values in order to find a clearer output
#So we sort the values alphabetically in the increasing order and added default colors to pick all the variables

statesums <- table(new_data4$SUBUNIT)
statef <- as.data.frame(statesums)
colnames(statef) <- c("SUBUNIT", "count")
 statef <- transform(statef,
          SUBUNIT=reorder(SUBUNIT, -desc(count)))



g <- ggplot(statef, 
            aes(x = SUBUNIT, y = count, label = count, fill="maroon"))+ coord_flip() +
  geom_col() +
  geom_text(nudge_y = -1, color = "black") + 
  ggtitle("SUBUNIT against count") + 
  xlab("SUBUNIT") + ylab("count") +
  theme_bw() + theme(legend.position = "none") + 
  theme(plot.title = element_text(hjust = 0.8)) 
g

Although we see that the location might be in the underground but after seeing the subunit we can deduce that although the location might be unknown but for subunits most of the accident occured were mainly in the Strip,Quarry,Open Pit And Mill Operation/Preparation Plant and in the Underground subunit.

Next we can explore the UG_MINING_METHOD or the Underground mining method

#Over here we see that there also needs a reordering of the values in order to find a clearer output
#So we sort the values alphabetically in the increasing order and added default colors to pick all the variables

statesums <- table(new_data4$UG_MINING_METHOD)
statef <- as.data.frame(statesums)
colnames(statef) <- c("UG_MINING_METHOD", "count")
 statef <- transform(statef,
          UG_MINING_METHOD=reorder(UG_MINING_METHOD, -desc(count)))


g <- ggplot(statef, 
            
            aes(x = UG_MINING_METHOD, y = count, label = count, fill="maroon"))+ coord_flip() +
  geom_col() +
  geom_text(nudge_y = -1, color = "black") + 
  ggtitle("UG_MINING_METHOD against count") + 
  xlab("UG_MINING_METHOD") + ylab("count") +
  theme_bw() + theme(legend.position = "none") + 
  theme(plot.title = element_text(hjust = 0.8)) 
g

We have seem to come across anomalies where it shows that the Mining Methods are missing for the maximum counts so the mining methods are unknown for most of the cases maybe if we explore other columns we maybe able to deduce what this column is trying to tell us

Now we see the MINING_EQUIP column to see what Mining Equipments were there in the accident

#Over here we see that there also needs a reordering of the values in order to find a clearer output
#So we sort the values alphabetically in the increasing order and added default colors to pick all the variables
df <- as.data.frame(table(new_data4$MINING_EQUIP))
df <- df[order(-df$Freq),]
dfshort <- df[1:20,]

statesums <- dfshort
statef <- statesums
colnames(statef) <- c("MINING_EQUIP", "count")
 statef <- transform(statef,
          MINING_EQUIP=reorder(MINING_EQUIP, -desc(count)))


g <- ggplot(statef, 
            aes(x = MINING_EQUIP, y = count, label = count, fill="maroon"))+ coord_flip() +
  geom_col() +
  geom_text(nudge_y = -1, color = "black") + 
  ggtitle("MINING_EQUIP against count") + 
  xlab("MINING_EQUIP") + ylab("count") +
  theme_bw() + theme(legend.position = "none") + 
  theme(plot.title = element_text(hjust = 0.7)) 
g

Since the number of MINING_EQUIPs were huge we had to cut it down to top 20 values in order to plot a clear bar chart Unfortunately there are so many categories that horizontal barcharts might not be appropriate but we can still see that most of the equipments during the accident were Invalid Equipments as it was mentioned that any missing values in the column would be considered as an invalid equipment according to the data dictionary.

So up until this stage we found out that most of the accidents occured in unknow Underground locations with unknown mining methods with invalid equipments.

Next we explore the CLASSIFICATION column

#Over here we see that there also needs a reordering of the values in order to find a clearer output
#So we sort the values alphabetically in the increasing order and added default colors to pick all the variables
df <- as.data.frame(table(new_data4$OCCUPATION))
df <- df[order(-df$Freq),]
dfshort <- df[1:20,]

statesums <- dfshort
statef <- statesums
colnames(statef) <- c("CLASSIFICATION", "count")
 statef <- transform(statef,
          CLASSIFICATION=reorder(CLASSIFICATION, -desc(count)))


g <- ggplot(statef, 
            aes(x = CLASSIFICATION, y = count, label = count, fill="maroon"))+ coord_flip() +
  geom_col() +
  geom_text(nudge_y = -1, color = "black") + 
  ggtitle("CLASSIFICATION against count") + 
  xlab("CLASSIFICATION") + ylab("count") +
  theme_bw() + theme(legend.position = "none") + 
  theme(plot.title = element_text(size = rel(0.6) ))+
  theme(axis.text.y=element_text(size=rel(0.6)))
g

Since the number of CLASSIFICATION was huge we had to reduce it down to first 20 values in order to perform decent plotting We can deduce that most of the Classification falls on Handling of materials, Slip or fall of person, Non powered handtools, machinery, fall of roof or back and power haulage. So most of the Accidents happened due to Handling of materials

We can be more clear if we explore the ACCIDENT_TYPE column

#Over here we see that there also needs a reordering of the values in order to find a clearer output
#So we sort the values alphabetically in the increasing order and added default colors to pick all the variables

statesums <- table(new_data4$ACCIDENT_TYPE)
statef <- as.data.frame(statesums)
colnames(statef) <- c("ACCIDENT_TYPE", "count")
 statef <- transform(statef,
          ACCIDENT_TYPE=reorder(ACCIDENT_TYPE, -desc(count)))


g <- ggplot(statef, 
            aes(x = ACCIDENT_TYPE, y = count, label = count, fill="maroon"))+ coord_flip() +
  geom_col() +
  geom_text(nudge_y = -1, color = "black") + 
  ggtitle("ACCIDENT_TYPE against count") + 
  xlab("ACCIDENT_TYPE") + ylab("count") +
  theme_bw() + theme(legend.position = "none") + 
  theme(plot.title = element_text(size = rel(0.7) ))+
  theme(axis.text.y=element_text(size=rel(0.6)))
g

In the plot above we see that most of the Accident types from the ACCIDENT_TYPE columns is the N.E.C classified Accidents meaning when an accident is classified as bodily reaction and exertion, n.e.c., it refers to an accident that resulted from a combination of free bodily motion and excessive physical effort or action. It is a type of bodily reaction and exertion accident. However, it is a bodily reaction and exertion accident that does not fit or fall into a category. It is classified as n.e.c.. This means it is not elsewhere classified. Other accident types include the ones without injuries and over exertion in lifting objects. Other down the row we see accidents such as getting struck by a stationary objects and caught in, under and below moving objects.

Which brings a question whether the employees who faced the most of the accidents were inexperienced or experienced

So now we explore the column which gives us the information about the employee’s occupation from the OCCUPATION column

#Over here we see that there also needs a reordering of the values in order to find a clearer output
#So we sort the values alphabetically in the increasing order and added default colors to pick all the variables
df <- as.data.frame(table(new_data4$OCCUPATION))
df <- df[order(-df$Freq),]
dfshort <- df[1:20,]
statesums <- dfshort
statef <- statesums
colnames(statef) <- c("OCCUPATION", "count")
 statef <- transform(statef,
          OCCUPATION=reorder(OCCUPATION, -desc(count)))

g <- ggplot(statef, 
            aes(x = OCCUPATION, y = count, label = count, fill="maroon"))+ coord_flip() +
  geom_col() +
  geom_text(nudge_y = -1, color = "black") + 
  ggtitle("OCCUPATION against count") + 
  xlab("OCCUPATION") + ylab("count") +
  theme_bw() + theme(legend.position = "none") + 
  theme(plot.title = element_text(size = rel(0.7) ))+
  theme(axis.text.y=element_text(size=rel(0.6)))
g

Since the number of OCCUPATIONs were huge we had to cut it down to first 20 values in order to do comparison In this plot above we see that most of the occupation from the Bar chart of the OCCUPATION are from the group Maintanence man, Mechanic, Serviceman/Repair,Boilermaker,Fueler, Tire tech, Field Service tech and the second most occuring group is an Invalid group which raises the question, What Occupation were they invovled with? Other columns included Warehouse Bagger, Store Keeper, Packager, Fabricator, Cleaning Plant Operator

Next we explore the activities that they were doing at that time so we check the ACTIVITIES column

#Over here we see that there also needs a reordering of the values in order to find a clearer output
#So we sort the values alphabetically in the increasing order and added default colors to pick all the variables
df <- as.data.frame(table(new_data4$ACTIVITY))
df <- df[order(-df$Freq),]
dfshort <- df[1:20,]

statesums <- dfshort
statef <- statesums
colnames(statef) <- c("ACTIVITY", "count")
 statef <- transform(statef,
          ACTIVITY=reorder(ACTIVITY, -desc(count)))

g <- ggplot(statef, 
            aes(x = ACTIVITY, y = count, label = count, fill="maroon"))+ coord_flip() +
  geom_col() +
  geom_text(nudge_y = -1, color = "black") + 
  ggtitle("ACTIVITY against count") + 
  xlab("ACTIVITY") + ylab("count") +
  theme_bw() + theme(legend.position = "none") + 
  theme(plot.title = element_text(size = rel(0.7) ))+
  theme(axis.text.y=element_text(size=rel(0.6)))
g

Since the number of ACTIVITY was very huge we had to cut it down to first 20 values in order to do comparison We find out from the above plot that most of the activities were Handling Supplies Materials, Machine Maintanence Repairing,Handling Non Powered Tools, Invalid Activities this raises another question whether they were experienced or not ao to further clear our doubts we look at INJURY_SOURCE column

#Over here we see that there also needs a reordering of the values in order to find a clearer output
#So we sort the values alphabetically in the increasing order and added default colors to pick all the variables
df <- as.data.frame(table(new_data4$INJURY_SOURCE))
df <- df[order(-df$Freq),]
dfshort <- df[1:10,]

 
statesums <- dfshort
statef <- statesums
colnames(statef) <- c("INJURY_SOURCE", "count")
 statef <- transform(statef,
          INJURY_SOURCE=reorder(INJURY_SOURCE, -desc(count)))

g <- ggplot(statef, 
            aes(x = INJURY_SOURCE, y = count, label = count, fill="maroon"))+ coord_flip() +
  geom_col() +
  geom_text(nudge_y = -1, color = "black") + 
  ggtitle("INJURY_SOURCE against count") + 
  xlab("INJURY_SOURCE") + ylab("count") +
  theme_bw() + theme(legend.position = "none") + 
  theme(plot.title = element_text(size = rel(0.7) ))+
  theme(axis.text.y=element_text(size=rel(0.6)))
g

Since the amount of INJURY_SOURCE is so huge that we took the top 10 of the values and plotted our graph

In the above plot most of the injury happened from metal pipe wire nails, invalid injuries, metal covers and guards, the ground broken rock coal and caving rock coal. Further Single Column investigation is needed to Find what the invalid columns are and also to find a link to Injuries or accidents

#Over here we see that there also needs a reordering of the values in order to find a clearer output
#So we sort the values alphabetically in the increasing order and added default colors to pick all the variables

statesums <- table(new_data4$NATURE_INJURY)
statef <- as.data.frame(statesums)
colnames(statef) <- c("NATURE_INJURY", "count")
 statef <- transform(statef,
          NATURE_INJURY=reorder(NATURE_INJURY, -desc(count)))


g <- ggplot(statef, 
            aes(x = NATURE_INJURY, y = count, label = count, fill="maroon"))+ coord_flip() +

  geom_col() +
  geom_text(nudge_y = -1, color = "black") + 
  ggtitle("NATURE_INJURY against count") + 
  xlab("NATURE_INJURY") + ylab("count") +
  theme_bw() + theme(legend.position = "none") + 
  theme(plot.title = element_text(size = rel(0.7) ))+
  theme(axis.text.y=element_text(size=rel(0.6)))
g

in the above plot most of the nature of the injury falls under Open Wounds, Fractured Chip and Bruises and some even have Bruises and there are some that fall into Invalid Injury sources. So it is safe to consider the invalid injury sources has missing since they might not be working there in the mining sectors.

Histogram

We would look into the next column of interest which is DAYS_LOST so for this we will be using a histogram

ggplot(new_data4, aes(x=DAYS_LOST))  + 
geom_histogram(binwidth=10, fill="Gray")+ggtitle("DAYS_LOST against count")+theme(plot.title = element_text(size = rel(0.7) ))

#BINS need to be wider to make it less noisy

We can see in the plot above that there are outliers at 200 to 360 days and one at 450 and one at 500 and most of the concentration is at around 7 days and it is suspiciously very high around this are but a histogram of this plot is not much clear as to what meaning we should find out from this so we can use a Density Plot for this column

library(scales)
##Since this the above code gives a scewed graph we therefore use logarithms
#density plot was positively skewed so I used logarithms
# Basic density plot in ggplot2
ggplot(new_data4, aes(x = DAYS_LOST))  + 
  geom_density() + scale_x_log10(breaks=c(2.5,5,45,360,450,1000),labels=comma) +
  annotation_logticks(sides="bt") + theme(text = element_text(size = 10))+ggtitle("DAYS_LOST against count")+theme(plot.title = element_text(size = rel(0.7) ))
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 1119 rows containing non-finite values (stat_density).

We see that there are two peaks where one is at 2.5 days and the other one is at 45 days lost so these are the maximum days that were lost as a consequence to the accident

Exploratory Data Analysis with two variables

Now it is time for the comparison of two or more variables We will look at the columns UG_LOCATION against DEGREE_INJURY to see a relation

Heat maps

ggplot(data = new_data4) +ggtitle("CAL_YR against DEGREE_INJURY")+theme(plot.title = element_text(size = rel(0.7) )) + 
  geom_count(mapping = aes(x=CAL_YR, y=DEGREE_INJURY))+theme(axis.text.x = element_text(size = 5))+theme(axis.text.x = element_text(angle = 30, vjust = 1, hjust = 1)) 

We can clearly state that the DEGREE_INJURY decreases accross the accident years which is CAL_YR since density decreases across the years

Next we take a look at the relationship between DEGREE_INJURY and JOB_EXPER through a scatter plot

#degree injury against job experience
ggplot(data= new_data4, mapping = aes(JOB_EXPER,DEGREE_INJURY)) + ggtitle("JOB_EXPER against DEGREE_INJURY")+theme(plot.title = element_text(size = rel(0.7) )) + geom_point(aes(as.numeric(JOB_EXPER),DEGREE_INJURY,color= DEGREE_INJURY), show.legend = FALSE)

We can clearly state that as JOB_EXPER or job experience increases the injury degree or DEGREE_INJURY decreases

Density plots and Heat maps

require(gridExtra)
## Loading required package: gridExtra
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
#shift time against accident time
g1<-ggplot(data= new_data4, mapping = aes(SHIFT_BEGIN_TIME,ACCIDENT_TIME)) + ggtitle("SHIFT_BEGIN_TIME against ACCIDENT_TIME") +theme(plot.title = element_text(size = rel(0.7) ))+ geom_point(aes(SHIFT_BEGIN_TIME,ACCIDENT_TIME,color= SHIFT_BEGIN_TIME), show.legend = FALSE)

g2<-ggplot(data = new_data4) +
  geom_count(mapping = aes(x=SHIFT_BEGIN_TIME, y=ACCIDENT_TIME))+theme(axis.text.x = element_text(size = 5))+theme(axis.text.x = element_text(angle = 90, vjust = 1, hjust = 1))+theme(axis.text.y = element_text(angle = 90, vjust = 1, hjust = 1))

grid.arrange(g1, g2, nrow=2)

The plot for both the graphs. The heat map and density plot shows that the accidents occured more between shifts of 6000 which is 6am and 7000 which is 7am

ggplot(data = new_data4) +
  geom_count(mapping = aes(x=ACCIDENT_TIME, y=OCCUPATION))+theme(axis.text.x = element_text(size = 5))+theme(axis.text.y = element_text(size = 5))+theme(axis.text.y = element_text(angle = 0, vjust = 1, hjust = 1))+ggtitle("ACCIDENT_TIME against OCCUPATION") +theme(plot.title = element_text(size = rel(0.7) ))

Referring to the plot above the Occupation of Maintenance man,Mechanic, Repair/Serviceman,Boilermaker etc has the most dense ACCIDENT_TIME between approximately 7500 and 1500 of the 24 hour clock of the ACCIDENT_TIME axis and we also get to see that the INVALID_OCCUPATION of the OCCUPATION axis had uniform accident in the ACCIDENT_TIME axis throughout from 0000 to 2400 with the same density of the heat map. We can assume that the INVALID_OCCUPATION falls well underneath occupations which include excessive physical work like a movie set where stunt people practice their movie shots.

End of the Exploratory Data Analysis